Thera Bank recently saw a steep decline in the number of users of its credit cards. Credit cards are a good source of income for banks because of the different kinds of fees they charge, such as annual fees, balance transfer fees, cash advance fees, late payment fees, foreign transaction fees, and others. Some fees are charged to every user irrespective of usage, while others are charged only under specified circumstances. Customers leaving the credit card service would lead the bank to a loss, so the bank wants to analyze its customer data to identify the customers who are likely to leave their credit card services and the reasons why — so that the bank can improve in those areas. As a data scientist at Thera Bank, you need to come up with a classification model that predicts whether a customer is going to churn or not. This will help the bank improve its services so that customers do not give up their credit cards. The goal is to identify the best possible model that delivers the required performance.
# To help with reading and manipulating data
import pandas as pd
import numpy as np
# To help with data visualization
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# To be used for missing value imputation
from sklearn.impute import SimpleImputer
# To help with model building
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
AdaBoostClassifier,
GradientBoostingClassifier,
RandomForestClassifier,
BaggingClassifier,
)
from xgboost import XGBClassifier
# To get different metric scores, and split data
from sklearn import metrics
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import (
f1_score,
accuracy_score,
recall_score,
precision_score,
confusion_matrix,
roc_auc_score,
plot_confusion_matrix,
)
# To be used for data scaling and one hot encoding
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
# To be used for tuning the model
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
# To be used for creating pipelines and personalizing them
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
# To define maximum number of columns to be displayed in a dataframe
pd.set_option("display.max_columns", None)
# To suppress scientific notation in dataframe displays
pd.set_option("display.float_format", lambda x: "%.3f" % x)
# To suppress warnings
import warnings
warnings.filterwarnings("ignore")
# This will help in making the Python code more structured automatically (good coding practice)
#%load_ext nb_blackG
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import ClusterCentroids
# Loading the dataset
# NOTE(review): hard-coded absolute Windows path — this will only run on the
# original author's machine; consider a relative path or configuration value.
creditcardDF = pd.read_csv("D:\\course\\BankChurners.CSV")
print(f'There are {creditcardDF.shape[0]} rows and {creditcardDF.shape[1]} columns.') # f-string
np.random.seed(2) # setting the random seed via np.random.seed to get the same random result
creditcardDF.sample(n=10) # 10 random rows
There are 10127 rows and 21 columns.
| CLIENTNUM | Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 7862 | 711589758 | Existing Customer | 42 | M | 5 | High School | Married | $60K - $80K | Blue | 32 | 1 | 2 | 2 | 4162.000 | 1641 | 2521.000 | 0.719 | 3967 | 75 | 0.923 | 0.394 |
| 3536 | 717886008 | Existing Customer | 63 | M | 1 | High School | Single | Less than $40K | Blue | 51 | 5 | 3 | 2 | 6102.000 | 0 | 6102.000 | 0.639 | 4527 | 67 | 0.558 | 0.000 |
| 9770 | 720852108 | Existing Customer | 46 | M | 4 | High School | Divorced | $120K + | Blue | 31 | 2 | 3 | 1 | 3569.000 | 1553 | 2016.000 | 0.798 | 16695 | 116 | 0.681 | 0.435 |
| 8909 | 719785683 | Existing Customer | 41 | M | 4 | Post-Graduate | Married | $80K - $120K | Blue | 36 | 1 | 2 | 2 | 21751.000 | 1573 | 20178.000 | 0.878 | 8332 | 102 | 0.569 | 0.072 |
| 709 | 780054258 | Existing Customer | 40 | M | 5 | Graduate | NaN | $40K - $60K | Blue | 25 | 6 | 2 | 3 | 7860.000 | 541 | 7319.000 | 0.764 | 1367 | 35 | 0.750 | 0.069 |
| 975 | 717180633 | Existing Customer | 59 | M | 1 | Post-Graduate | Married | $40K - $60K | Blue | 36 | 4 | 2 | 2 | 2483.000 | 1372 | 1111.000 | 1.642 | 1704 | 35 | 0.458 | 0.553 |
| 32 | 709029408 | Existing Customer | 41 | M | 4 | Graduate | Married | $60K - $80K | Blue | 36 | 4 | 1 | 2 | 8923.000 | 2517 | 6406.000 | 1.726 | 1589 | 24 | 1.667 | 0.282 |
| 9454 | 708510858 | Existing Customer | 60 | F | 1 | Uneducated | Single | Less than $40K | Blue | 47 | 1 | 3 | 2 | 4905.000 | 2413 | 2492.000 | 0.853 | 15478 | 109 | 0.730 | 0.492 |
| 4548 | 781297983 | Existing Customer | 58 | M | 2 | NaN | Divorced | $60K - $80K | Blue | 52 | 6 | 2 | 3 | 20410.000 | 1196 | 19214.000 | 0.726 | 3525 | 78 | 0.733 | 0.059 |
| 9351 | 789983133 | Existing Customer | 31 | M | 2 | Graduate | NaN | $80K - $120K | Silver | 22 | 2 | 3 | 2 | 34516.000 | 1780 | 32736.000 | 0.839 | 14185 | 98 | 0.690 | 0.052 |
# Checking the number of rows and columns in the data
creditcardDF.shape
(10127, 21)
# Work on a copy so the raw data in creditcardDF stays untouched
customerdata = creditcardDF.copy()
# let's view the first 5 rows of the data
customerdata.head()
| CLIENTNUM | Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 768805383 | Existing Customer | 45 | M | 3 | High School | Married | $60K - $80K | Blue | 39 | 5 | 1 | 3 | 12691.000 | 777 | 11914.000 | 1.335 | 1144 | 42 | 1.625 | 0.061 |
| 1 | 818770008 | Existing Customer | 49 | F | 5 | Graduate | Single | Less than $40K | Blue | 44 | 6 | 1 | 2 | 8256.000 | 864 | 7392.000 | 1.541 | 1291 | 33 | 3.714 | 0.105 |
| 2 | 713982108 | Existing Customer | 51 | M | 3 | Graduate | Married | $80K - $120K | Blue | 36 | 4 | 1 | 0 | 3418.000 | 0 | 3418.000 | 2.594 | 1887 | 20 | 2.333 | 0.000 |
| 3 | 769911858 | Existing Customer | 40 | F | 4 | High School | NaN | Less than $40K | Blue | 34 | 3 | 4 | 1 | 3313.000 | 2517 | 796.000 | 1.405 | 1171 | 20 | 2.333 | 0.760 |
| 4 | 709106358 | Existing Customer | 40 | M | 3 | Uneducated | Married | $60K - $80K | Blue | 21 | 5 | 1 | 0 | 4716.000 | 0 | 4716.000 | 2.175 | 816 | 28 | 2.500 | 0.000 |
# let's view the last 5 rows of the data
customerdata.tail()
| CLIENTNUM | Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 10122 | 772366833 | Existing Customer | 50 | M | 2 | Graduate | Single | $40K - $60K | Blue | 40 | 3 | 2 | 3 | 4003.000 | 1851 | 2152.000 | 0.703 | 15476 | 117 | 0.857 | 0.462 |
| 10123 | 710638233 | Attrited Customer | 41 | M | 2 | NaN | Divorced | $40K - $60K | Blue | 25 | 4 | 2 | 3 | 4277.000 | 2186 | 2091.000 | 0.804 | 8764 | 69 | 0.683 | 0.511 |
| 10124 | 716506083 | Attrited Customer | 44 | F | 1 | High School | Married | Less than $40K | Blue | 36 | 5 | 3 | 4 | 5409.000 | 0 | 5409.000 | 0.819 | 10291 | 60 | 0.818 | 0.000 |
| 10125 | 717406983 | Attrited Customer | 30 | M | 2 | Graduate | NaN | $40K - $60K | Blue | 36 | 4 | 3 | 3 | 5281.000 | 0 | 5281.000 | 0.535 | 8395 | 62 | 0.722 | 0.000 |
| 10126 | 714337233 | Attrited Customer | 43 | F | 2 | Graduate | Married | Less than $40K | Silver | 25 | 6 | 2 | 4 | 10388.000 | 1961 | 8427.000 | 0.703 | 10294 | 61 | 0.649 | 0.189 |
# let's check the data types of the columns in the dataset
customerdata.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10127 entries, 0 to 10126 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CLIENTNUM 10127 non-null int64 1 Attrition_Flag 10127 non-null object 2 Customer_Age 10127 non-null int64 3 Gender 10127 non-null object 4 Dependent_count 10127 non-null int64 5 Education_Level 8608 non-null object 6 Marital_Status 9378 non-null object 7 Income_Category 10127 non-null object 8 Card_Category 10127 non-null object 9 Months_on_book 10127 non-null int64 10 Total_Relationship_Count 10127 non-null int64 11 Months_Inactive_12_mon 10127 non-null int64 12 Contacts_Count_12_mon 10127 non-null int64 13 Credit_Limit 10127 non-null float64 14 Total_Revolving_Bal 10127 non-null int64 15 Avg_Open_To_Buy 10127 non-null float64 16 Total_Amt_Chng_Q4_Q1 10127 non-null float64 17 Total_Trans_Amt 10127 non-null int64 18 Total_Trans_Ct 10127 non-null int64 19 Total_Ct_Chng_Q4_Q1 10127 non-null float64 20 Avg_Utilization_Ratio 10127 non-null float64 dtypes: float64(5), int64(10), object(6) memory usage: 1.6+ MB
# let's check for duplicate values in the data
customerdata.duplicated().sum()
0
# let's check for missing values in the data
round(customerdata.isnull().sum() / customerdata.isnull().count() * 100, 2)
CLIENTNUM 0.000 Attrition_Flag 0.000 Customer_Age 0.000 Gender 0.000 Dependent_count 0.000 Education_Level 15.000 Marital_Status 7.400 Income_Category 0.000 Card_Category 0.000 Months_on_book 0.000 Total_Relationship_Count 0.000 Months_Inactive_12_mon 0.000 Contacts_Count_12_mon 0.000 Credit_Limit 0.000 Total_Revolving_Bal 0.000 Avg_Open_To_Buy 0.000 Total_Amt_Chng_Q4_Q1 0.000 Total_Trans_Amt 0.000 Total_Trans_Ct 0.000 Total_Ct_Chng_Q4_Q1 0.000 Avg_Utilization_Ratio 0.000 dtype: float64
The Education_Level column has 15% missing values and the Marital_Status column has 7.4% missing values out of the total observations.

# Checking the count of null values in the dataset
customerdata.isna().sum()
CLIENTNUM 0 Attrition_Flag 0 Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 1519 Marital_Status 749 Income_Category 0 Card_Category 0 Months_on_book 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Avg_Open_To_Buy 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Trans_Ct 0 Total_Ct_Chng_Q4_Q1 0 Avg_Utilization_Ratio 0 dtype: int64
Let's check the number of unique values in each column
customerdata.nunique()
CLIENTNUM 10127 Attrition_Flag 2 Customer_Age 45 Gender 2 Dependent_count 6 Education_Level 6 Marital_Status 3 Income_Category 6 Card_Category 4 Months_on_book 44 Total_Relationship_Count 6 Months_Inactive_12_mon 7 Contacts_Count_12_mon 7 Credit_Limit 6205 Total_Revolving_Bal 1974 Avg_Open_To_Buy 6813 Total_Amt_Chng_Q4_Q1 1158 Total_Trans_Amt 5033 Total_Trans_Ct 126 Total_Ct_Chng_Q4_Q1 830 Avg_Utilization_Ratio 964 dtype: int64
# let's view the statistical summary of the numerical columns in the data
customerdata.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| CLIENTNUM | 10127.000 | 739177606.334 | 36903783.450 | 708082083.000 | 713036770.500 | 717926358.000 | 773143533.000 | 828343083.000 |
| Customer_Age | 10127.000 | 46.326 | 8.017 | 26.000 | 41.000 | 46.000 | 52.000 | 73.000 |
| Dependent_count | 10127.000 | 2.346 | 1.299 | 0.000 | 1.000 | 2.000 | 3.000 | 5.000 |
| Months_on_book | 10127.000 | 35.928 | 7.986 | 13.000 | 31.000 | 36.000 | 40.000 | 56.000 |
| Total_Relationship_Count | 10127.000 | 3.813 | 1.554 | 1.000 | 3.000 | 4.000 | 5.000 | 6.000 |
| Months_Inactive_12_mon | 10127.000 | 2.341 | 1.011 | 0.000 | 2.000 | 2.000 | 3.000 | 6.000 |
| Contacts_Count_12_mon | 10127.000 | 2.455 | 1.106 | 0.000 | 2.000 | 2.000 | 3.000 | 6.000 |
| Credit_Limit | 10127.000 | 8631.954 | 9088.777 | 1438.300 | 2555.000 | 4549.000 | 11067.500 | 34516.000 |
| Total_Revolving_Bal | 10127.000 | 1162.814 | 814.987 | 0.000 | 359.000 | 1276.000 | 1784.000 | 2517.000 |
| Avg_Open_To_Buy | 10127.000 | 7469.140 | 9090.685 | 3.000 | 1324.500 | 3474.000 | 9859.000 | 34516.000 |
| Total_Amt_Chng_Q4_Q1 | 10127.000 | 0.760 | 0.219 | 0.000 | 0.631 | 0.736 | 0.859 | 3.397 |
| Total_Trans_Amt | 10127.000 | 4404.086 | 3397.129 | 510.000 | 2155.500 | 3899.000 | 4741.000 | 18484.000 |
| Total_Trans_Ct | 10127.000 | 64.859 | 23.473 | 10.000 | 45.000 | 67.000 | 81.000 | 139.000 |
| Total_Ct_Chng_Q4_Q1 | 10127.000 | 0.712 | 0.238 | 0.000 | 0.582 | 0.702 | 0.818 | 3.714 |
| Avg_Utilization_Ratio | 10127.000 | 0.275 | 0.276 | 0.000 | 0.023 | 0.176 | 0.503 | 0.999 |
Checking the value count for each category of categorical variables
# List of all categorical variables to inspect
cat_col = [
    "Attrition_Flag",
    "Gender",
    "Education_Level",
    "Marital_Status",
    "Income_Category",
    "Card_Category",
]
# Print the frequency of every unique value in each categorical column,
# separated by a dashed rule for readability
separator = "-" * 40
for col_name in cat_col:
    print(customerdata[col_name].value_counts())
    print(separator)
Existing Customer 8500 Attrited Customer 1627 Name: Attrition_Flag, dtype: int64 ---------------------------------------- F 5358 M 4769 Name: Gender, dtype: int64 ---------------------------------------- Graduate 3128 High School 2013 Uneducated 1487 College 1013 Post-Graduate 516 Doctorate 451 Name: Education_Level, dtype: int64 ---------------------------------------- Married 4687 Single 3943 Divorced 748 Name: Marital_Status, dtype: int64 ---------------------------------------- Less than $40K 3561 $40K - $60K 1790 $80K - $120K 1535 $60K - $80K 1402 abc 1112 $120K + 727 Name: Income_Category, dtype: int64 ---------------------------------------- Blue 9436 Silver 555 Gold 116 Platinum 20 Name: Card_Category, dtype: int64 ----------------------------------------
# Re-map the junk "abc" category to the modal income bracket, "Less than $40K"
customerdata['Income_Category'] = customerdata['Income_Category'].replace('abc', 'Less than $40K')
print(customerdata.Income_Category.value_counts())
Less than $40K 4673 $40K - $60K 1790 $80K - $120K 1535 $60K - $80K 1402 $120K + 727 Name: Income_Category, dtype: int64
# Dropping the CLIENTNUM column: it is a unique customer identifier
# (10127 distinct values for 10127 rows) and carries no predictive signal
# for the Attrition_Flag target
customerdata.drop(['CLIENTNUM'],axis=1, inplace=True)
# function to plot a boxplot and a histogram along the same scale.
def histogram_boxplot(data, feature, figsize=(12, 7), kde=False, bins=None):
    """
    Plot a boxplot and a histogram of one numerical column on a shared x-axis.

    data: dataframe containing the feature
    feature: name of the numerical column to plot
    figsize: size of the figure (default (12, 7))
    kde: whether to overlay a density curve on the histogram (default False)
    bins: number of histogram bins (default None, i.e. let seaborn decide)
    """
    # Two stacked subplots: a slim boxplot on top, the histogram below
    f2, (ax_box2, ax_hist2) = plt.subplots(
        nrows=2,
        sharex=True,  # x-axis shared so box and histogram line up
        gridspec_kw={"height_ratios": (0.25, 0.75)},
        figsize=figsize,
    )
    # Boxplot; showmeans adds a marker for the mean value of the column
    sns.boxplot(data=data, x=feature, ax=ax_box2, showmeans=True, color="violet")
    # Single histplot call: "auto" is seaborn's default bin rule, so passing it
    # when bins is None replaces the original's duplicated ternary call.
    # (The original also passed palette="winter" without hue, which seaborn
    # ignores with a warning — dropped.)
    sns.histplot(data=data, x=feature, kde=kde, ax=ax_hist2, bins=bins if bins else "auto")
    # Mark mean (green dashed) and median (black solid) on the histogram
    ax_hist2.axvline(data[feature].mean(), color="green", linestyle="--")
    ax_hist2.axvline(data[feature].median(), color="black", linestyle="-")
# Observations on Customer_age
histogram_boxplot(customerdata, "Customer_Age")
histogram_boxplot(customerdata, "Months_on_book")
histogram_boxplot(customerdata, "Total_Relationship_Count")
histogram_boxplot(customerdata, "Months_Inactive_12_mon")
histogram_boxplot(customerdata, "Contacts_Count_12_mon")
histogram_boxplot(customerdata, "Credit_Limit")
histogram_boxplot(customerdata, "Total_Revolving_Bal")
histogram_boxplot(customerdata, "Avg_Open_To_Buy")
histogram_boxplot(customerdata, "Total_Trans_Amt")
histogram_boxplot(customerdata, "Total_Trans_Ct")
histogram_boxplot(customerdata, "Total_Ct_Chng_Q4_Q1")
histogram_boxplot(customerdata, "Total_Amt_Chng_Q4_Q1")
histogram_boxplot(customerdata, "Avg_Utilization_Ratio")
# function to create labeled barplots
def labeled_barplot(data, feature, perc=False, n=None):
    """
    Draw a barplot of a categorical column with each bar annotated with
    its count (or percentage).

    data: dataframe containing the feature
    feature: name of the categorical column to plot
    perc: whether to display percentages instead of counts (default False)
    n: display only the top n category levels (default None, i.e. all levels)
    """
    total = len(data[feature])  # number of rows; denominator for percentages
    count = data[feature].nunique()
    # Widen the figure with the number of categories shown
    if n is None:
        plt.figure(figsize=(count + 1, 5))
    else:
        plt.figure(figsize=(n + 1, 5))
    plt.xticks(rotation=90, fontsize=15)
    ax = sns.countplot(
        data=data,
        x=feature,
        palette="Paired",
        # top-n levels by frequency, then sorted alphabetically for display
        order=data[feature].value_counts().index[:n].sort_values(),
    )
    for p in ax.patches:
        if perc:  # idiomatic truth test (original compared `perc == True`)
            # percentage of each class of the category
            label = "{:.1f}%".format(100 * p.get_height() / total)
        else:
            label = p.get_height()  # count of each level of the category
        x = p.get_x() + p.get_width() / 2  # horizontal center of the bar
        y = p.get_height()  # top of the bar
        ax.annotate(
            label,
            (x, y),
            ha="center",
            va="center",
            size=12,
            xytext=(0, 5),  # nudge label 5 points above the bar
            textcoords="offset points",
        )
    plt.show()
# observations on Attrition flag
labeled_barplot(customerdata, "Attrition_Flag")
# observations on Sex
labeled_barplot(customerdata, "Gender")
# observations on Dependant count
labeled_barplot(customerdata, "Dependent_count")
# observations on Education level
labeled_barplot(customerdata, "Education_Level")
# observations on Marital_Status
labeled_barplot(customerdata, "Marital_Status")
# observations on Income category
labeled_barplot(customerdata, "Income_Category")
# observations on Card category
labeled_barplot(customerdata, "Card_Category")
sns.pairplot(customerdata, hue="Attrition_Flag")
<seaborn.axisgrid.PairGrid at 0x1d4adbba910>
sns.set(rc={"figure.figsize": (10, 7)})
sns.boxplot(x="Attrition_Flag", y="Customer_Age", data=customerdata, orient="vertical")
<AxesSubplot:xlabel='Attrition_Flag', ylabel='Customer_Age'>
sns.set(rc={"figure.figsize": (10, 7)})
sns.boxplot(x="Attrition_Flag", y="Dependent_count", data=customerdata, orient="vertical")
<AxesSubplot:xlabel='Attrition_Flag', ylabel='Dependent_count'>
sns.set(rc={"figure.figsize": (10, 7)})
sns.boxplot(x="Attrition_Flag", y="Months_on_book", data=customerdata, orient="vertical")
<AxesSubplot:xlabel='Attrition_Flag', ylabel='Months_on_book'>
sns.set(rc={"figure.figsize": (10, 7)})
sns.boxplot(x="Attrition_Flag", y="Total_Relationship_Count", data=customerdata)
<AxesSubplot:xlabel='Attrition_Flag', ylabel='Total_Relationship_Count'>
sns.set(rc={"figure.figsize": (10, 7)})
sns.boxplot(x="Attrition_Flag", y="Months_Inactive_12_mon", data=customerdata)
<AxesSubplot:xlabel='Attrition_Flag', ylabel='Months_Inactive_12_mon'>
sns.set(rc={"figure.figsize": (10, 7)})
sns.boxplot(x="Attrition_Flag", y="Contacts_Count_12_mon", data=customerdata)
<AxesSubplot:xlabel='Attrition_Flag', ylabel='Contacts_Count_12_mon'>
sns.set(rc={"figure.figsize": (10, 7)})
sns.boxplot(x="Attrition_Flag", y="Credit_Limit", data=customerdata)
<AxesSubplot:xlabel='Attrition_Flag', ylabel='Credit_Limit'>
sns.set(rc={"figure.figsize": (10, 7)})
sns.boxplot(x="Attrition_Flag", y="Total_Revolving_Bal", data=customerdata)
<AxesSubplot:xlabel='Attrition_Flag', ylabel='Total_Revolving_Bal'>
sns.set(rc={"figure.figsize": (10, 7)})
sns.boxplot(x="Attrition_Flag", y="Avg_Open_To_Buy", data=customerdata)
<AxesSubplot:xlabel='Attrition_Flag', ylabel='Avg_Open_To_Buy'>
sns.set(rc={"figure.figsize": (10, 7)})
sns.boxplot(x="Attrition_Flag", y="Total_Amt_Chng_Q4_Q1", data=customerdata)
<AxesSubplot:xlabel='Attrition_Flag', ylabel='Total_Amt_Chng_Q4_Q1'>
sns.set(rc={"figure.figsize": (10, 7)})
sns.boxplot(x="Attrition_Flag", y="Total_Trans_Amt", data=customerdata)
<AxesSubplot:xlabel='Attrition_Flag', ylabel='Total_Trans_Amt'>
sns.set(rc={"figure.figsize": (10, 7)})
sns.boxplot(x="Attrition_Flag", y="Total_Trans_Ct", data=customerdata)
<AxesSubplot:xlabel='Attrition_Flag', ylabel='Total_Trans_Ct'>
sns.set(rc={"figure.figsize": (10, 7)})
sns.boxplot(x="Attrition_Flag", y="Total_Ct_Chng_Q4_Q1", data=customerdata)
<AxesSubplot:xlabel='Attrition_Flag', ylabel='Total_Ct_Chng_Q4_Q1'>
sns.set(rc={"figure.figsize": (10, 7)})
sns.boxplot(x="Attrition_Flag", y="Avg_Utilization_Ratio", data=customerdata)
<AxesSubplot:xlabel='Attrition_Flag', ylabel='Avg_Utilization_Ratio'>
# function to plot stacked bar chart
def stacked_barplot(data, predictor, target):
    """
    Print crosstabs of predictor vs target and plot a normalized stacked
    bar chart of the target distribution within each predictor level.

    data: dataframe
    predictor: name of the independent (categorical) variable
    target: name of the target variable
    """
    count = data[predictor].nunique()
    # Sort rows by the least frequent target class (value_counts is sorted
    # descending, so index[-1] is the rarest class — the class of interest)
    sorter = data[target].value_counts().index[-1]
    # Raw counts, with row/column totals (margins)
    tab1 = pd.crosstab(data[predictor], data[target], margins=True).sort_values(
        by=sorter, ascending=False
    )
    print(tab1)
    print("-" * 120)
    # Row-normalized proportions for the stacked chart
    tab = pd.crosstab(data[predictor], data[target], normalize="index").sort_values(
        by=sorter, ascending=False
    )
    tab.plot(kind="bar", stacked=True, figsize=(count + 1, 5))
    # The original called plt.legend twice; the first call (lower left,
    # frameless) was immediately overwritten by this one, so it was dead code.
    plt.legend(loc="upper left", bbox_to_anchor=(1, 1))
    plt.show()
stacked_barplot(customerdata, "Gender", "Attrition_Flag")
Attrition_Flag Attrited Customer Existing Customer All Gender All 1627 8500 10127 F 930 4428 5358 M 697 4072 4769 ------------------------------------------------------------------------------------------------------------------------
stacked_barplot(customerdata, "Education_Level", "Attrition_Flag")
Attrition_Flag Attrited Customer Existing Customer All Education_Level All 1371 7237 8608 Graduate 487 2641 3128 High School 306 1707 2013 Uneducated 237 1250 1487 College 154 859 1013 Doctorate 95 356 451 Post-Graduate 92 424 516 ------------------------------------------------------------------------------------------------------------------------
stacked_barplot(customerdata, "Marital_Status", "Attrition_Flag")
Attrition_Flag Attrited Customer Existing Customer All Marital_Status All 1498 7880 9378 Married 709 3978 4687 Single 668 3275 3943 Divorced 121 627 748 ------------------------------------------------------------------------------------------------------------------------
stacked_barplot(customerdata, "Income_Category", "Attrition_Flag")
Attrition_Flag Attrited Customer Existing Customer All Income_Category All 1627 8500 10127 Less than $40K 799 3874 4673 $40K - $60K 271 1519 1790 $80K - $120K 242 1293 1535 $60K - $80K 189 1213 1402 $120K + 126 601 727 ------------------------------------------------------------------------------------------------------------------------
stacked_barplot(customerdata, "Card_Category", "Attrition_Flag")
Attrition_Flag Attrited Customer Existing Customer All Card_Category All 1627 8500 10127 Blue 1519 7917 9436 Silver 82 473 555 Gold 21 95 116 Platinum 5 15 20 ------------------------------------------------------------------------------------------------------------------------
plt.figure(figsize=(15, 7))
# corr() on a mixed-dtype frame silently dropped the object columns in older
# pandas but raises a TypeError from pandas 2.0 onward; selecting the numeric
# columns explicitly gives the same correlation matrix on every version.
sns.heatmap(
    customerdata.select_dtypes(include=np.number).corr(),
    annot=True, vmin=-1, vmax=1, fmt=".2f", cmap="Spectral",
)
plt.show()
# Outlier detection: one boxplot per numerical column, laid out on a 5x4 grid
numerical_col = customerdata.select_dtypes(include=np.number).columns.tolist()
plt.figure(figsize=(20, 30))
for position, col_name in enumerate(numerical_col, start=1):
    plt.subplot(5, 4, position)
    plt.boxplot(customerdata[col_name], whis=1.5)  # whiskers at 1.5*IQR
    plt.tight_layout()
    plt.title(col_name)
plt.show()
def treat_outliers(data, col):
    """
    Cap outliers in one numerical column using the 1.5*IQR whisker rule.

    data: dataframe to modify (edited in place and also returned)
    col: name of the numerical column to treat
    """
    # 25th and 75th percentiles in one call
    q1, q3 = data[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    lower_whisker = q1 - 1.5 * iqr
    upper_whisker = q3 + 1.5 * iqr
    # Values below the lower whisker are raised to it and values above the
    # upper whisker are lowered to it; everything in between is untouched.
    data[col] = np.clip(data[col], lower_whisker, upper_whisker)
    return data
def treat_outliers_all(data, col_list):
    """
    Cap outliers in every listed numerical column via the 1.5*IQR rule.

    data: dataframe to modify
    col_list: list of numerical column names to treat
    """
    for column_name in col_list:
        data = treat_outliers(data, column_name)
    return data
# All numerical columns in the data
numerical_col = customerdata.select_dtypes(include=np.number).columns.tolist()
# Columns selected for outlier capping
treating = {
    'Total_Ct_Chng_Q4_Q1', 'Total_Trans_Amt', 'Total_Amt_Chng_Q4_Q1',
    'Avg_Open_To_Buy', 'Credit_Limit', 'Customer_Age', 'Months_on_book',
    'Months_Inactive_12_mon', 'Contacts_Count_12_mon',
}
# Keep only the selected columns, preserving their dataframe order
numerical_col = [name for name in numerical_col if name in treating]
data = treat_outliers_all(customerdata, numerical_col)
# Re-check outliers after treatment: one boxplot per numerical column (5x4 grid)
numerical_col = customerdata.select_dtypes(include=np.number).columns.tolist()
plt.figure(figsize=(20, 30))
for position, col_name in enumerate(numerical_col, start=1):
    plt.subplot(5, 4, position)
    plt.boxplot(customerdata[col_name], whis=1.5)  # whiskers at 1.5*IQR
    plt.tight_layout()
    plt.title(col_name)
plt.show()
# Encode the target: 1 = churned ("Attrited Customer"), 0 = retained ("Existing Customer")
customerdata['Attrition_Flag'] = customerdata['Attrition_Flag'].map({'Attrited Customer': 1, 'Existing Customer': 0}).astype(int)
customerdata.head()
| Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 45.000 | M | 3 | High School | Married | $60K - $80K | Blue | 39.000 | 5 | 1.000 | 3.000 | 12691.000 | 777 | 11914.000 | 1.201 | 1144.000 | 42 | 1.172 | 0.061 |
| 1 | 0 | 49.000 | F | 5 | Graduate | Single | Less than $40K | Blue | 44.000 | 6 | 1.000 | 2.000 | 8256.000 | 864 | 7392.000 | 1.201 | 1291.000 | 33 | 1.172 | 0.105 |
| 2 | 0 | 51.000 | M | 3 | Graduate | Married | $80K - $120K | Blue | 36.000 | 4 | 1.000 | 0.500 | 3418.000 | 0 | 3418.000 | 1.201 | 1887.000 | 20 | 1.172 | 0.000 |
| 3 | 0 | 40.000 | F | 4 | High School | NaN | Less than $40K | Blue | 34.000 | 3 | 4.000 | 1.000 | 3313.000 | 2517 | 796.000 | 1.201 | 1171.000 | 20 | 1.172 | 0.760 |
| 4 | 0 | 40.000 | M | 3 | Uneducated | Married | $60K - $80K | Blue | 21.000 | 5 | 1.000 | 0.500 | 4716.000 | 0 | 4716.000 | 1.201 | 816.000 | 28 | 1.172 | 0.000 |
customerdata.isna().sum()
Attrition_Flag 0 Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 1519 Marital_Status 749 Income_Category 0 Card_Category 0 Months_on_book 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Avg_Open_To_Buy 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Trans_Ct 0 Total_Ct_Chng_Q4_Q1 0 Avg_Utilization_Ratio 0 dtype: int64
customerdata.Education_Level.value_counts()
Graduate 3128 High School 2013 Uneducated 1487 College 1013 Post-Graduate 516 Doctorate 451 Name: Education_Level, dtype: int64
# Work on a copy so the raw dataframe stays intact,
# then separate the predictors from the churn target.
df = customerdata.copy()
y = df["Attrition_Flag"]
X = df.drop(columns=["Attrition_Flag"])
# Hold out 20% of the data as the final test set (stratified on the target),
# then carve a validation set out of the remainder: 25% of 80% = 20% overall,
# leaving a 60/20/20 train/validation/test split.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1, stratify=y
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=1, stratify=y_temp
)
print(X_train.shape, X_val.shape, X_test.shape)
(6075, 19) (2026, 19) (2026, 19)
# Education-level distribution within the test split
X_test.Education_Level.value_counts()
Graduate 651 High School 381 Uneducated 300 College 196 Post-Graduate 103 Doctorate 98 Name: Education_Level, dtype: int64
# Missing values in the validation split (Education_Level and Marital_Status only)
X_val.isna().sum()
Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 294 Marital_Status 140 Income_Category 0 Card_Category 0 Months_on_book 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Avg_Open_To_Buy 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Trans_Ct 0 Total_Ct_Chng_Q4_Q1 0 Avg_Utilization_Ratio 0 dtype: int64
# Impute missing categorical values with the per-column mode.
# The imputer is fit on the TRAIN split only, and the fitted statistics are
# reused for validation and test so no information leaks from those splits.
# Fitting both columns at once replaces the original fragile pattern of
# re-fitting a single imputer per column in a specific order; "most_frequent"
# is computed independently per column, so the result is identical.
cat_cols_with_na = ["Education_Level", "Marital_Status"]
imp_mode = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
X_train[cat_cols_with_na] = imp_mode.fit_transform(X_train[cat_cols_with_na])
X_val[cat_cols_with_na] = imp_mode.transform(X_val[cat_cols_with_na])
X_test[cat_cols_with_na] = imp_mode.transform(X_test[cat_cols_with_na])
# sanity check: no missing values should remain in the training split
X_train.isna().sum()
Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 0 Marital_Status 0 Income_Category 0 Card_Category 0 Months_on_book 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Avg_Open_To_Buy 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Trans_Ct 0 Total_Ct_Chng_Q4_Q1 0 Avg_Utilization_Ratio 0 dtype: int64
# Confirm imputation removed all missing values from the validation split
X_val.isna().sum()
Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 0 Marital_Status 0 Income_Category 0 Card_Category 0 Months_on_book 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Avg_Open_To_Buy 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Trans_Ct 0 Total_Ct_Chng_Q4_Q1 0 Avg_Utilization_Ratio 0 dtype: int64
# Peek at the test split after imputation
X_test.head()
| Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 9760 | 32.000 | M | 1 | High School | Single | $80K - $120K | Blue | 26.000 | 2 | 3.000 | 2.000 | 6407.000 | 1130 | 5277.000 | 0.756 | 8619.250 | 93 | 0.603 | 0.176 |
| 7413 | 50.000 | M | 1 | Post-Graduate | Single | $60K - $80K | Blue | 36.000 | 4 | 3.000 | 2.000 | 2317.000 | 0 | 2317.000 | 0.734 | 2214.000 | 41 | 0.519 | 0.000 |
| 6074 | 54.000 | F | 2 | High School | Married | $40K - $60K | Blue | 36.000 | 3 | 3.000 | 3.000 | 3892.000 | 0 | 3892.000 | 0.738 | 4318.000 | 74 | 0.762 | 0.000 |
| 3520 | 61.000 | M | 0 | Uneducated | Married | $120K + | Blue | 36.000 | 4 | 3.000 | 4.000 | 23836.250 | 2517 | 21655.000 | 0.424 | 1658.000 | 27 | 0.500 | 0.104 |
| 6103 | 41.000 | F | 3 | College | Married | $40K - $60K | Blue | 17.500 | 5 | 3.000 | 4.000 | 4312.000 | 2517 | 1795.000 | 0.741 | 2693.000 | 56 | 0.436 | 0.584 |
# Confirm imputation removed all missing values from the test split
X_test.isna().sum()
Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 0 Marital_Status 0 Income_Category 0 Card_Category 0 Months_on_book 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Avg_Open_To_Buy 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Trans_Ct 0 Total_Ct_Chng_Q4_Q1 0 Avg_Utilization_Ratio 0 dtype: int64
# Education-level distribution in the validation split after mode imputation
X_val.Education_Level.value_counts()
Graduate 917 High School 404 Uneducated 306 College 199 Post-Graduate 101 Doctorate 99 Name: Education_Level, dtype: int64
# One-hot encode the categorical variables, dropping the first level of each.
# Dummies are created per split, so validation/test are realigned to the
# training columns: a category absent from one split would otherwise produce
# a mismatched feature matrix at predict time. (Here all splits happen to
# contain every category, so the reindex is a no-op safety net.)
X_train = pd.get_dummies(data=X_train, drop_first=True)
X_val = pd.get_dummies(data=X_val, drop_first=True)
X_test = pd.get_dummies(data=X_test, drop_first=True)
X_val = X_val.reindex(columns=X_train.columns, fill_value=0)
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
# Verify dtypes and column count (29 features) after one-hot encoding
X_train.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 6075 entries, 800 to 4035 Data columns (total 29 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Customer_Age 6075 non-null float64 1 Dependent_count 6075 non-null int64 2 Months_on_book 6075 non-null float64 3 Total_Relationship_Count 6075 non-null int64 4 Months_Inactive_12_mon 6075 non-null float64 5 Contacts_Count_12_mon 6075 non-null float64 6 Credit_Limit 6075 non-null float64 7 Total_Revolving_Bal 6075 non-null int64 8 Avg_Open_To_Buy 6075 non-null float64 9 Total_Amt_Chng_Q4_Q1 6075 non-null float64 10 Total_Trans_Amt 6075 non-null float64 11 Total_Trans_Ct 6075 non-null int64 12 Total_Ct_Chng_Q4_Q1 6075 non-null float64 13 Avg_Utilization_Ratio 6075 non-null float64 14 Gender_M 6075 non-null uint8 15 Education_Level_Doctorate 6075 non-null uint8 16 Education_Level_Graduate 6075 non-null uint8 17 Education_Level_High School 6075 non-null uint8 18 Education_Level_Post-Graduate 6075 non-null uint8 19 Education_Level_Uneducated 6075 non-null uint8 20 Marital_Status_Married 6075 non-null uint8 21 Marital_Status_Single 6075 non-null uint8 22 Income_Category_$40K - $60K 6075 non-null uint8 23 Income_Category_$60K - $80K 6075 non-null uint8 24 Income_Category_$80K - $120K 6075 non-null uint8 25 Income_Category_Less than $40K 6075 non-null uint8 26 Card_Category_Gold 6075 non-null uint8 27 Card_Category_Platinum 6075 non-null uint8 28 Card_Category_Silver 6075 non-null uint8 dtypes: float64(10), int64(4), uint8(15) memory usage: 800.9 KB
# Validation split should have the same 29 encoded columns as train
X_val.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 2026 entries, 2894 to 6319 Data columns (total 29 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Customer_Age 2026 non-null float64 1 Dependent_count 2026 non-null int64 2 Months_on_book 2026 non-null float64 3 Total_Relationship_Count 2026 non-null int64 4 Months_Inactive_12_mon 2026 non-null float64 5 Contacts_Count_12_mon 2026 non-null float64 6 Credit_Limit 2026 non-null float64 7 Total_Revolving_Bal 2026 non-null int64 8 Avg_Open_To_Buy 2026 non-null float64 9 Total_Amt_Chng_Q4_Q1 2026 non-null float64 10 Total_Trans_Amt 2026 non-null float64 11 Total_Trans_Ct 2026 non-null int64 12 Total_Ct_Chng_Q4_Q1 2026 non-null float64 13 Avg_Utilization_Ratio 2026 non-null float64 14 Gender_M 2026 non-null uint8 15 Education_Level_Doctorate 2026 non-null uint8 16 Education_Level_Graduate 2026 non-null uint8 17 Education_Level_High School 2026 non-null uint8 18 Education_Level_Post-Graduate 2026 non-null uint8 19 Education_Level_Uneducated 2026 non-null uint8 20 Marital_Status_Married 2026 non-null uint8 21 Marital_Status_Single 2026 non-null uint8 22 Income_Category_$40K - $60K 2026 non-null uint8 23 Income_Category_$60K - $80K 2026 non-null uint8 24 Income_Category_$80K - $120K 2026 non-null uint8 25 Income_Category_Less than $40K 2026 non-null uint8 26 Card_Category_Gold 2026 non-null uint8 27 Card_Category_Platinum 2026 non-null uint8 28 Card_Category_Silver 2026 non-null uint8 dtypes: float64(10), int64(4), uint8(15) memory usage: 267.1 KB
# Test split should have the same 29 encoded columns as train
X_test.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 2026 entries, 9760 to 413 Data columns (total 29 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Customer_Age 2026 non-null float64 1 Dependent_count 2026 non-null int64 2 Months_on_book 2026 non-null float64 3 Total_Relationship_Count 2026 non-null int64 4 Months_Inactive_12_mon 2026 non-null float64 5 Contacts_Count_12_mon 2026 non-null float64 6 Credit_Limit 2026 non-null float64 7 Total_Revolving_Bal 2026 non-null int64 8 Avg_Open_To_Buy 2026 non-null float64 9 Total_Amt_Chng_Q4_Q1 2026 non-null float64 10 Total_Trans_Amt 2026 non-null float64 11 Total_Trans_Ct 2026 non-null int64 12 Total_Ct_Chng_Q4_Q1 2026 non-null float64 13 Avg_Utilization_Ratio 2026 non-null float64 14 Gender_M 2026 non-null uint8 15 Education_Level_Doctorate 2026 non-null uint8 16 Education_Level_Graduate 2026 non-null uint8 17 Education_Level_High School 2026 non-null uint8 18 Education_Level_Post-Graduate 2026 non-null uint8 19 Education_Level_Uneducated 2026 non-null uint8 20 Marital_Status_Married 2026 non-null uint8 21 Marital_Status_Single 2026 non-null uint8 22 Income_Category_$40K - $60K 2026 non-null uint8 23 Income_Category_$60K - $80K 2026 non-null uint8 24 Income_Category_$80K - $120K 2026 non-null uint8 25 Income_Category_Less than $40K 2026 non-null uint8 26 Card_Category_Gold 2026 non-null uint8 27 Card_Category_Platinum 2026 non-null uint8 28 Card_Category_Silver 2026 non-null uint8 dtypes: float64(10), int64(4), uint8(15) memory usage: 267.1 KB
# Class balance of the training target — churners (1) are a ~16% minority
y_train.value_counts()
0 5099 1 976 Name: Attrition_Flag, dtype: int64
# Class balance of the test target (stratification preserved the ~16% ratio)
y_test.value_counts()
0 1701 1 325 Name: Attrition_Flag, dtype: int64
# Candidate (name, estimator) pairs to compare with default hyperparameters
models = [
    ("Bagging", BaggingClassifier(random_state=1)),
    ("Random forest", RandomForestClassifier(random_state=1)),
    ("GBM", GradientBoostingClassifier(random_state=1)),
    ("Adaboost", AdaBoostClassifier(random_state=1)),
    ("Xgboost", XGBClassifier(random_state=1, eval_metric="logloss")),
    ("dtree", DecisionTreeClassifier(random_state=1)),
]
results = []  # CV recall scores per model (fed to the boxplot below)
names = []    # model names, aligned with `results`
score = []    # validation recall per model
# Recall is the metric of interest: missing a churner (false negative) costs
# the bank more than flagging a loyal customer.
# The scorer and CV splitter are loop-invariant, so they are created once
# instead of on every iteration as in the original cell.
scoring = "recall"
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
print("\n" "Cross-Validation Performance:" "\n")
for name, model in models:
    cv_result = cross_val_score(
        estimator=model, X=X_train, y=y_train, scoring=scoring, cv=kfold
    )
    results.append(cv_result)
    names.append(name)
    print(f"{name}: {cv_result.mean() * 100}")
print("\n" "Validation Performance:" "\n")
for name, model in models:
    model.fit(X_train, y_train)
    scores = recall_score(y_val, model.predict(X_val))
    score.append(scores)
    print(f"{name}: {scores}")
Cross-Validation Performance: Bagging: 79.71114599686028 Random forest: 75.40763997906855 GBM: 81.86185243328102 Adaboost: 81.34746206174779 Xgboost: 86.98587127158555 dtree: 78.38304552590267 Validation Performance: Bagging: 0.7944785276073619 Random forest: 0.7975460122699386 GBM: 0.8650306748466258 Adaboost: 0.8588957055214724 Xgboost: 0.8957055214723927 dtree: 0.803680981595092
# Visual comparison of the cross-validated recall distributions
fig = plt.figure()
fig.suptitle("Algorithm Comparison")
axes = fig.add_subplot(111)
axes.boxplot(results)
axes.set_xticklabels(names)
plt.show()
**We will tune the XGBoost, GBM, and AdaBoost models using RandomizedSearchCV.**
First, let's create two helper functions — one to compute the different metrics and one to plot the confusion matrix — so that we don't have to repeat the same code for each model.
# defining a function to compute different metrics to check performance of a classification model built using sklearn
def model_performance_classification_sklearn(model, predictors, target):
    """
    Compute accuracy, recall, precision and F1 for a fitted classifier.

    model: fitted sklearn-style classifier (must expose .predict)
    predictors: independent variables
    target: true labels

    Returns a one-row DataFrame holding the four scores.
    """
    pred = model.predict(predictors)
    # collect the four scores into a single-row frame for side-by-side display
    metric_values = {
        "Accuracy": accuracy_score(target, pred),
        "Recall": recall_score(target, pred),
        "Precision": precision_score(target, pred),
        "F1": f1_score(target, pred),
    }
    return pd.DataFrame(metric_values, index=[0])
def confusion_matrix_sklearn(model, predictors, target):
    """
    Plot the confusion matrix of `model` on (predictors, target),
    annotating each cell with its raw count and its share of all samples.

    model: fitted classifier (must expose .predict)
    predictors: independent variables
    target: true labels
    """
    cm = confusion_matrix(target, model.predict(predictors))
    total = cm.flatten().sum()
    # one "count\npercent" label per cell, reshaped back to the 2x2 grid
    labels = np.asarray(
        [
            "{0:0.0f}".format(cell) + "\n{0:.2%}".format(cell / total)
            for cell in cm.flatten()
        ]
    ).reshape(2, 2)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=labels, fmt="")
    plt.ylabel("True label")
    plt.xlabel("Predicted label")
%%time
# defining model
model = XGBClassifier(random_state=1,eval_metric='logloss')
# Parameter grid to pass in RandomizedSearchCV
param_grid={'n_estimators':np.arange(50,150,50),
'scale_pos_weight':[2,5,10],
'learning_rate':[0.01,0.1,0.2,0.05],
'gamma':[0,1,3,5],
'subsample':[0.8,0.9,1],
'max_depth':np.arange(1,5,1),
'reg_lambda':[5,10]}
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
#Calling RandomizedSearchCV
xgb_tuned2 = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=50, scoring=scorer, cv=5, random_state=1, n_jobs = -1)
#Fitting parameters in RandomizedSearchCV
xgb_tuned2.fit(X_train,y_train)
print("Best parameters are {} with CV score={}:" .format(xgb_tuned2.best_params_,xgb_tuned2.best_score_))
Best parameters are {'subsample': 0.8, 'scale_pos_weight': 10, 'reg_lambda': 10, 'n_estimators': 50, 'max_depth': 2, 'learning_rate': 0.1, 'gamma': 0} with CV score=0.9682260596546313:
Wall time: 1min 22s
# Rebuild the XGBoost model with the best parameters found by the search
xgb_tuned2 = XGBClassifier(
    random_state=1,
    n_estimators=50,
    max_depth=2,
    learning_rate=0.1,
    gamma=0,
    subsample=0.8,
    scale_pos_weight=10,
    reg_lambda=10,
    eval_metric="logloss",
)
# Refit on the full training data
xgb_tuned2.fit(X_train, y_train)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
gamma=0, gpu_id=-1, importance_type='gain',
interaction_constraints='', learning_rate=0.1, max_delta_step=0,
max_depth=2, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=50, n_jobs=4,
num_parallel_tree=1, random_state=1, reg_alpha=0, reg_lambda=10,
scale_pos_weight=10, subsample=0.8, tree_method='exact',
validate_parameters=1, verbosity=None)
# Calculating different metrics on train set
xgboost_random_train = model_performance_classification_sklearn(
xgb_tuned2, X_train, y_train
)
print("Training performance:")
xgboost_random_train
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.847 | 0.986 | 0.513 | 0.674 |
# Calculating different metrics on validation set
# NOTE: recall is high but precision is low — scale_pos_weight=10 biases the
# model strongly toward predicting the churn class
xgboost_random_val = model_performance_classification_sklearn(xgb_tuned2, X_val, y_val)
print("Validation performance:")
xgboost_random_val
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.844 | 0.960 | 0.508 | 0.665 |
# creating confusion matrix for the tuned XGBoost on the validation set
confusion_matrix_sklearn(xgb_tuned2, X_val, y_val)
%%time
# defining model
model = AdaBoostClassifier(random_state=1)
# Parameter grid to pass in RandomizedSearchCV
param_grid1 = {
#Let's try different max_depth for base_estimator
"base_estimator":[DecisionTreeClassifier(max_depth=1),DecisionTreeClassifier(max_depth=2),DecisionTreeClassifier(max_depth=3)],
"n_estimators": np.arange(10,110,10),
"learning_rate":np.arange(0.1,2,0.1)
}
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
#Calling RandomizedSearchCV
ada_tuned2 = RandomizedSearchCV(estimator=model, param_distributions=param_grid1, n_iter=50, scoring=scorer, cv=5, random_state=1, n_jobs = -1)
#Fitting parameters in RandomizedSearchCV
ada_tuned2.fit(X_train,y_train)
print("Best parameters are {} with CV score={}:" .format(ada_tuned2.best_params_,ada_tuned2.best_score_))
Best parameters are {'n_estimators': 80, 'learning_rate': 0.7000000000000001, 'base_estimator': DecisionTreeClassifier(max_depth=2)} with CV score=0.8565201465201465:
Wall time: 1min 41s
# Rebuild AdaBoost with the best parameters found by the search
# (depth-2 trees as weak learners, 80 estimators, learning rate ~0.7)
ada_tuned2 = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2),
learning_rate=0.7000000000000001, n_estimators=80,
random_state=1)
# Fit the model on training data
ada_tuned2.fit(X_train, y_train)
AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2),
learning_rate=0.7000000000000001, n_estimators=80,
random_state=1)
# Calculating different metrics on train set
adaboost_random_train = model_performance_classification_sklearn(
ada_tuned2, X_train, y_train
)
print("Training performance:")
adaboost_random_train
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.988 | 0.956 | 0.967 | 0.961 |
# Calculating different metrics on validation set
# NOTE: recall drops from 0.956 (train) to 0.865 (validation) — some overfitting
adaboost_random_val = model_performance_classification_sklearn(ada_tuned2, X_val, y_val)
print("Validation performance:")
adaboost_random_val
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.967 | 0.865 | 0.925 | 0.894 |
# creating confusion matrix for the tuned AdaBoost on the validation set
confusion_matrix_sklearn(ada_tuned2, X_val, y_val)
%%time
# defining model
model = GradientBoostingClassifier(init=AdaBoostClassifier(random_state=1),random_state=1)
# Parameter grid to pass in RandomizedSearchCV
param_grid2 = {
"n_estimators": [100,150,200,250],
"subsample":[0.8,0.9,1],
"max_features":[0.7,0.8,0.9,1]
}
## add from article
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
#Calling RandomizedSearchCV
gbc_tuned = RandomizedSearchCV(estimator=model, param_distributions=param_grid2, n_iter=50, scoring=scorer, cv=5, random_state=1, n_jobs = -1)
#Fitting parameters in RandomizedSearchCV
gbc_tuned.fit(X_train,y_train)
print("Best parameters are {} with CV score={}:" .format(gbc_tuned.best_params_,gbc_tuned.best_score_))
Best parameters are {'subsample': 0.8, 'n_estimators': 250, 'max_features': 0.7} with CV score=0.8575614861329146:
Wall time: 4min 17s
# Rebuild the GBM with the best parameters found by the search
gbc_tuned = GradientBoostingClassifier(init=AdaBoostClassifier(random_state=1),random_state=1,
subsample=0.8, n_estimators=250,
max_features=0.7)
# Fit the model on training data
gbc_tuned.fit(X_train, y_train)
GradientBoostingClassifier(init=AdaBoostClassifier(random_state=1),
max_features=0.7, n_estimators=250, random_state=1,
subsample=0.8)
# Calculating different metrics on train set
gbm_random_train = model_performance_classification_sklearn(
gbc_tuned, X_train, y_train
)
print("Training performance:")
gbm_random_train
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.989 | 0.952 | 0.981 | 0.966 |
# Calculating different metrics on validation set
# Best balanced validation scores among the three tuned models so far
gbc_random_val = model_performance_classification_sklearn(gbc_tuned, X_val, y_val)
print("Validation performance:")
gbc_random_val
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.972 | 0.890 | 0.935 | 0.912 |
# creating confusion matrix for the tuned GBM on the validation set
confusion_matrix_sklearn(gbc_tuned, X_val, y_val)
print("Before UpSampling, counts of label '1': {}".format(sum(y_train==1)))
print("Before UpSampling, counts of label '0': {} \n".format(sum(y_train==0)))
X_train, y_train
sm = SMOTE(sampling_strategy = 1 ,k_neighbors = 5, random_state=1) #Synthetic Minority Over Sampling Technique
X_train_res, y_train_res = sm.fit_resample(X_train, y_train.ravel())
print("After UpSampling, counts of label '1': {}".format(sum(y_train_res==1)))
print("After UpSampling, counts of label '0': {} \n".format(sum(y_train_res==0)))
print('After UpSampling, the shape of train_X: {}'.format(X_train_res.shape))
print('After UpSampling, the shape of train_y: {} \n'.format(y_train_res.shape))
Before UpSampling, counts of label '1': 976 Before UpSampling, counts of label '0': 5099 After UpSampling, counts of label '1': 5099 After UpSampling, counts of label '0': 5099 After UpSampling, the shape of train_X: (10198, 29) After UpSampling, the shape of train_y: (10198,)
# Refit the tuned AdaBoost on the SMOTE-upsampled training data
ada_tuned2.fit(X_train_res, y_train_res)
AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2),
learning_rate=0.7000000000000001, n_estimators=80,
random_state=1)
# Calculating different metrics on the (upsampled) train set
adaboost_random_train_res = model_performance_classification_sklearn(
ada_tuned2, X_train_res, y_train_res
)
print("Training performance:")
adaboost_random_train_res
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.990 | 0.989 | 0.991 | 0.990 |
print("Before UpSampling, counts of label '1': {}".format(sum(y_val==1)))
print("Before UpSampling, counts of label '0': {} \n".format(sum(y_val==0)))
X_train, y_train
sm = SMOTE(sampling_strategy = 1 ,k_neighbors = 5, random_state=1) #Synthetic Minority Over Sampling Technique
X_val_res, y_val_res = sm.fit_resample(X_val, y_val.ravel())
print("After UpSampling, counts of label '1': {}".format(sum(y_val_res==1)))
print("After UpSampling, counts of label '0': {} \n".format(sum(y_val_res==0)))
print('After UpSampling, the shape of train_X: {}'.format(X_val_res.shape))
print('After UpSampling, the shape of train_y: {} \n'.format(y_val_res.shape))
Before UpSampling, counts of label '1': 326 Before UpSampling, counts of label '0': 1700 After UpSampling, counts of label '1': 1700 After UpSampling, counts of label '0': 1700 After UpSampling, the shape of train_X: (3400, 29) After UpSampling, the shape of train_y: (3400,)
# Calculating different metrics on the (upsampled) validation set
adaboost_random_val_res = model_performance_classification_sklearn(ada_tuned2, X_val_res, y_val_res)
print("Validation performance:")
adaboost_random_val_res
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.977 | 0.972 | 0.981 | 0.977 |
# creating confusion matrix on the upsampled validation set
confusion_matrix_sklearn(ada_tuned2, X_val_res, y_val_res)
# Refit the tuned GBM on the SMOTE-upsampled training data
gbc_tuned.fit(X_train_res, y_train_res)
GradientBoostingClassifier(init=AdaBoostClassifier(random_state=1),
max_features=0.7, n_estimators=250, random_state=1,
subsample=0.8)
# Calculating different metrics on the (upsampled) train set
gbm_random_train_res = model_performance_classification_sklearn(
gbc_tuned, X_train_res, y_train_res
)
print("Training performance:")
gbm_random_train_res
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.991 | 0.991 | 0.991 | 0.991 |
# Calculating different metrics on the (upsampled) validation set
gbc_random_val_res = model_performance_classification_sklearn(gbc_tuned, X_val_res, y_val_res)
print("Validation performance:")
gbc_random_val_res
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.979 | 0.975 | 0.982 | 0.978 |
# Refit the tuned XGBoost on the SMOTE-upsampled training data
xgb_tuned2.fit(X_train_res, y_train_res)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
gamma=0, gpu_id=-1, importance_type='gain',
interaction_constraints='', learning_rate=0.1, max_delta_step=0,
max_depth=2, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=50, n_jobs=4,
num_parallel_tree=1, random_state=1, reg_alpha=0, reg_lambda=10,
scale_pos_weight=10, subsample=0.8, tree_method='exact',
validate_parameters=1, verbosity=None)
# Calculating different metrics on the (upsampled) train set
xgboost_random_train_res = model_performance_classification_sklearn(
xgb_tuned2, X_train_res, y_train_res
)
print("Training performance:")
xgboost_random_train_res
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.837 | 0.999 | 0.754 | 0.860 |
# Calculating different metrics on the (upsampled) validation set
# NOTE: scale_pos_weight=10 combined with 1:1 resampling pushes recall to ~1
# at the cost of precision
xgboost_random_val_res = model_performance_classification_sklearn(xgb_tuned2, X_val_res, y_val_res)
print("Validation performance:")
xgboost_random_val_res
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.834 | 0.998 | 0.752 | 0.858 |
# ClusterCentroids is used below but never imported at the top of the
# notebook — bring it into scope here.
from imblearn.under_sampling import ClusterCentroids

# Downsample the majority class of the TRAINING data to a 1:1 ratio by
# replacing majority-class samples with K-means cluster centroids.
print("Before downsampling, counts of label '1': {}".format(sum(y_train == 1)))
print("Before downsampling, counts of label '0': {} \n".format(sum(y_train == 0)))
cc = ClusterCentroids(random_state=1)  # fixed seed so the centroids are reproducible
X_cc_train, y_cc_train = cc.fit_resample(X_train, y_train)
print("After downsampling, counts of label '1': {}".format(sum(y_cc_train == 1)))
print("After downsampling, counts of label '0': {} \n".format(sum(y_cc_train == 0)))
# messages corrected: these are the train shapes, not val
print('After downsampling, the shape of train_X: {}'.format(X_cc_train.shape))
print('After downsampling, the shape of train_y: {} \n'.format(y_cc_train.shape))
Before downsampling, counts of label '1': 976 Before downsampling, counts of label '0': 5099 After downsampling, counts of label '1': 976 After downsampling, counts of label '0': 976 After downsampling, the shape of val_X: (1952, 29) After downsampling, the shape of val_y: (1952,)
# Shapes after downsampling: 976 samples per class
X_cc_train.shape
(1952, 29)
y_cc_train.shape
(1952,)
# ClusterCentroids is used below but never imported at the top of the
# notebook — bring it into scope here.
from imblearn.under_sampling import ClusterCentroids

# NOTE(review): downsampling the VALIDATION set changes its class balance, so
# metrics computed on it no longer reflect real-world performance — normally
# only the training split is resampled. Kept to match the original analysis.
print("Before Downsampling, counts of label '1': {}".format(sum(y_val == 1)))
print("Before Downsampling, counts of label '0': {} \n".format(sum(y_val == 0)))
cc = ClusterCentroids(random_state=1)  # fixed seed so the centroids are reproducible
X_cc_val, y_cc_val = cc.fit_resample(X_val, y_val)
print("After Downsampling, counts of label '1': {}".format(sum(y_cc_val == 1)))
print("After Downsampling, counts of label '0': {} \n".format(sum(y_cc_val == 0)))
print('After Downsampling, the shape of val_X: {}'.format(X_cc_val.shape))
print('After Downsampling, the shape of val_y: {} \n'.format(y_cc_val.shape))
Before Downsampling, counts of label '1': 326 Before Downsampling, counts of label '0': 1700 After Downsampling, counts of label '1': 326 After Downsampling, counts of label '0': 326 After Downsampling, the shape of val_X: (652, 29) After Downsampling, the shape of val_y: (652,)
# Refit the tuned AdaBoost on the centroid-downsampled training data
ada_tuned2.fit(X_cc_train, y_cc_train)
AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2),
learning_rate=0.7000000000000001, n_estimators=80,
random_state=1)
# Calculating different metrics on the (downsampled) train set
adaboost_random_train_down = model_performance_classification_sklearn(
ada_tuned2, X_cc_train, y_cc_train
)
print("Training performance:")
adaboost_random_train_down
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 1.000 | 1.000 | 1.000 | 1.000 |
# Calculating different metrics on the (downsampled) validation set
# NOTE: perfect training scores suggest the small 1952-row downsampled set
# is fully memorized
adaboost_random_val_down= model_performance_classification_sklearn(ada_tuned2, X_cc_val, y_cc_val)
print("Validation performance:")
adaboost_random_val_down
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.979 | 0.985 | 0.973 | 0.979 |
# creating confusion matrix on the downsampled validation set
confusion_matrix_sklearn(ada_tuned2, X_cc_val, y_cc_val)
# Refit the tuned GBM on the centroid-downsampled training data
gbc_tuned.fit(X_cc_train, y_cc_train)
GradientBoostingClassifier(init=AdaBoostClassifier(random_state=1),
max_features=0.7, n_estimators=250, random_state=1,
subsample=0.8)
# Calculating different metrics on the (downsampled) train set
gbm_random_train_down = model_performance_classification_sklearn(
gbc_tuned, X_cc_train, y_cc_train
)
print("Training performance:")
gbm_random_train_down
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 1.000 | 1.000 | 1.000 | 1.000 |
# Calculating different metrics on the (downsampled) validation set
gbc_random_val_down = model_performance_classification_sklearn(gbc_tuned, X_cc_val, y_cc_val)
print("Validation performance:")
gbc_random_val_down
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.985 | 0.991 | 0.979 | 0.985 |
# creating confusion matrix on the downsampled validation set
confusion_matrix_sklearn(gbc_tuned, X_cc_val, y_cc_val)
# Refit the tuned XGBoost on the centroid-downsampled training data
xgb_tuned2.fit(X_cc_train, y_cc_train)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
gamma=0, gpu_id=-1, importance_type='gain',
interaction_constraints='', learning_rate=0.1, max_delta_step=0,
max_depth=2, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=50, n_jobs=4,
num_parallel_tree=1, random_state=1, reg_alpha=0, reg_lambda=10,
scale_pos_weight=10, subsample=0.8, tree_method='exact',
validate_parameters=1, verbosity=None)
# Calculating different metrics on the (downsampled) train set
xgboost_random_train_down = model_performance_classification_sklearn(
xgb_tuned2, X_cc_train, y_cc_train
)
print("Training performance:")
xgboost_random_train_down
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.879 | 1.000 | 0.805 | 0.892 |
# Calculating different metrics on the (downsampled) validation set
xgboost_random_val_down = model_performance_classification_sklearn(xgb_tuned2, X_cc_val, y_cc_val)
print("Validation performance:")
xgboost_random_val_down
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.876 | 1.000 | 0.801 | 0.889 |
# creating confusion matrix for the tuned XGBoost on the downsampled
# validation set (the original cell mistakenly plotted gbc_tuned here,
# although this section evaluates xgb_tuned2)
confusion_matrix_sklearn(xgb_tuned2, X_cc_val, y_cc_val)
# Side-by-side training-performance comparison of every tuned model variant.
# Each summary is a one-row DataFrame; transpose them so metrics become rows.
train_summaries = [
    adaboost_random_train,
    gbm_random_train,
    xgboost_random_train,
    adaboost_random_train_res,
    gbm_random_train_res,
    xgboost_random_train_res,
    adaboost_random_train_down,
    gbm_random_train_down,
    xgboost_random_train_down,
]
models_train_comp_df = pd.concat([summary.T for summary in train_summaries], axis=1)
models_train_comp_df.columns = [
    "AdaBoost Tuned",
    "Gradient Boost Tuned",
    "Xgboost Tuned",
    "AdaBoost Upsampled",
    "Gradient Boost Upsampled",
    "Xgboost Upsampled",
    "AdaBoost Downsampled",
    "Gradient Boost Downsampled",
    "Xgboost Downsampled",
]
print("Training performance comparison:")
models_train_comp_df
Training performance comparison:
| AdaBoost Tuned | Gradient Boost Tuned | Xgboost Tuned | AdaBoost Upsampled | Gradient Boost Upsampled | Xgboost Upsampled | AdaBoost Downsampled | Gradient Boost Downsampled | Xgboost Downsampled | |
|---|---|---|---|---|---|---|---|---|---|
| Accuracy | 0.988 | 0.989 | 0.847 | 0.990 | 0.991 | 0.837 | 1.000 | 1.000 | 0.879 |
| Recall | 0.956 | 0.952 | 0.986 | 0.989 | 0.991 | 0.999 | 1.000 | 1.000 | 1.000 |
| Precision | 0.967 | 0.981 | 0.513 | 0.991 | 0.991 | 0.754 | 1.000 | 1.000 | 0.805 |
| F1 | 0.961 | 0.966 | 0.674 | 0.990 | 0.991 | 0.860 | 1.000 | 1.000 | 0.892 |
# Side-by-side validation-performance comparison of every tuned model variant.
val_summaries = [
    adaboost_random_val,
    gbc_random_val,
    xgboost_random_val,
    adaboost_random_val_res,
    gbc_random_val_res,
    xgboost_random_val_res,
    adaboost_random_val_down,
    gbc_random_val_down,
    xgboost_random_val_down,
]
models_val_comp_df = pd.concat([summary.T for summary in val_summaries], axis=1)
models_val_comp_df.columns = [
    "AdaBoost Tuned",
    "Gradient Boost Tuned",
    "Xgboost Tuned",
    "AdaBoost Upsampled",
    "Gradient Boost Upsampled",
    "Xgboost Upsampled",
    "AdaBoost Downsampled",
    "Gradient Boost Downsampled",
    "Xgboost Downsampled",
]
print("Validation performance comparison:")
models_val_comp_df
Validation performance comparison:
| AdaBoost Tuned | Gradient Boost Tuned | Xgboost Tuned | AdaBoost Upsampled | Gradient Boost Upsampled | Xgboost Upsampled | AdaBoost Downsampled | Gradient Boost Downsampled | Xgboost Downsampled | |
|---|---|---|---|---|---|---|---|---|---|
| Accuracy | 0.967 | 0.972 | 0.844 | 0.977 | 0.979 | 0.834 | 0.979 | 0.985 | 0.876 |
| Recall | 0.865 | 0.890 | 0.960 | 0.972 | 0.975 | 0.998 | 0.985 | 0.991 | 1.000 |
| Precision | 0.925 | 0.935 | 0.508 | 0.981 | 0.982 | 0.752 | 0.973 | 0.979 | 0.801 |
| F1 | 0.894 | 0.912 | 0.665 | 0.977 | 0.978 | 0.858 | 0.979 | 0.985 | 0.889 |
# Plot the tuned GBM's feature importances, least to most important.
# NOTE(review): gbc_tuned was last refit on the centroid-downsampled training
# data, so these importances reflect that fit — confirm this is intended.
feature_names = X_train.columns
importances = gbc_tuned.feature_importances_
indices = np.argsort(importances)
plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
ypos = range(len(indices))
plt.barh(ypos, importances[indices], color="violet", align="center")
plt.yticks(ypos, [feature_names[i] for i in indices])
plt.xlabel("Relative Importance")
plt.show()
# Downsampling Test data
# NOTE(review): resampling the *test* set changes the class distribution the
# model is evaluated on, so metrics computed on it are not comparable to the
# original test set — confirm this is intentional.
print("Before Downsampling, counts of label '1': {}".format((y_test == 1).sum()))
print("Before Downsampling, counts of label '0': {} \n".format((y_test == 0).sum()))
cc = ClusterCentroids()
X_cc_test, y_cc_test = cc.fit_resample(X_test, y_test)
print("After Downsampling, counts of label '1': {}".format((y_cc_test == 1).sum()))
print("After Downsampling, counts of label '0': {} \n".format((y_cc_test == 0).sum()))
print('After Downsampling, the shape of val_X: {}'.format(X_cc_test.shape))
print('After Downsampling, the shape of val_y: {} \n'.format(y_cc_test.shape))
Before Downsampling, counts of label '1': 325 Before Downsampling, counts of label '0': 1701 After Downsampling, counts of label '1': 325 After Downsampling, counts of label '0': 325 After Downsampling, the shape of val_X: (650, 29) After Downsampling, the shape of val_y: (650,)
# Fit the model on downsampled test data
# NOTE(review): the model is re-fit on the (downsampled) *test* data and then
# scored on that same data just below — those scores measure in-sample fit,
# not generalization. Confirm this is intentional.
gbc_tuned.fit(X_cc_test, y_cc_test)
GradientBoostingClassifier(init=AdaBoostClassifier(random_state=1),
max_features=0.7, n_estimators=250, random_state=1,
subsample=0.8)
# Calculating different metrics on test set
# (gbc_tuned was just fit on this same downsampled data, so these are
# in-sample scores — see note above the preceding fit call.)
gbc_random_test_down = model_performance_classification_sklearn(gbc_tuned, X_cc_test, y_cc_test)
print("Test performance downsampled:")
gbc_random_test_down
Test performance downsampled:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.997 | 1.000 | 0.994 | 0.997 |
# creating confusion matrix
# Confusion matrix for the model on the downsampled test data it was fit on.
confusion_matrix_sklearn(gbc_tuned, X_cc_test, y_cc_test)
# Fit the model on original test data
# NOTE(review): fitting on the full test set and then scoring on the same set
# (below) yields in-sample scores — the perfect metrics that follow do not
# reflect out-of-sample performance. Confirm this is intentional.
gbc_tuned.fit(X_test, y_test)
GradientBoostingClassifier(init=AdaBoostClassifier(random_state=1),
max_features=0.7, n_estimators=250, random_state=1,
subsample=0.8)
# Calculating different metrics on test set
# (gbc_tuned was just re-fit on this same test data — these are in-sample scores.)
gbc_random_test_orig = model_performance_classification_sklearn(gbc_tuned, X_test, y_test)
print("Test performance original:")
gbc_random_test_orig
Test performance original:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 1.000 | 1.000 | 1.000 | 1.000 |
# creating confusion matrix
# Confusion matrix for the model on the original test data it was fit on.
confusion_matrix_sklearn(gbc_tuned, X_test, y_test)
For categorical columns, we will apply missing value imputation and one-hot encoding as pre-processing steps.
We impute missing values for the whole dataset so that any missing values appearing in future data can also be handled automatically.
# Numerical columns that will go through median imputation in the preprocessor.
numerical_features = [
    "Customer_Age",
    "Dependent_count",
    "Months_on_book",
    "Total_Relationship_Count",
    "Months_Inactive_12_mon",
    "Contacts_Count_12_mon",
    "Credit_Limit",
    "Total_Revolving_Bal",
    "Avg_Open_To_Buy",
    "Total_Trans_Amt",
    "Total_Trans_Ct",
    "Total_Ct_Chng_Q4_Q1",
    "Total_Amt_Chng_Q4_Q1",
    "Avg_Utilization_Ratio",
]
# Numerical pre-processing: fill missing values with the column median.
median_imputer = SimpleImputer(strategy="median")
numeric_transformer = Pipeline(steps=[("imputer", median_imputer)])
# Categorical columns that will be imputed and then one-hot encoded.
categorical_features = ["Gender", "Education_Level", "Marital_Status", "Income_Category", "Card_Category"]
# Categorical pre-processing: most-frequent imputation followed by one-hot
# encoding. handle_unknown="ignore" lets the fitted encoder cope with
# categories in test/future data that were not seen at fit time.
_categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
]
categorical_transformer = Pipeline(steps=_categorical_steps)
# combining categorical transformer and numerical transformer using a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    remainder="passthrough",
)
# remainder = "passthrough" has been used, it will allow variables that are present in original data
# but not in "numerical_columns" and "categorical_columns" to pass through the column transformer without any changes
# Separating target variable and other variables
Y = creditcardDF["Attrition_Flag"]
X = creditcardDF.drop(columns=["Attrition_Flag"])
# Splitting the data into train and test sets: 70/30, stratified on the churn
# label so both splits keep the original class balance; fixed seed for
# reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.30, random_state=1, stratify=Y
)
print(X_train.shape, X_test.shape)
(7088, 20) (3039, 20)
# Creating new pipeline with best parameters: pre-processing followed by the
# tuned gradient-boosting classifier.
_best_gb = GradientBoostingClassifier(
    init=AdaBoostClassifier(random_state=1),
    max_features=0.7,
    n_estimators=250,
    subsample=0.8,
    random_state=1,
)
model = Pipeline(steps=[("pre", preprocessor), ("GB", _best_gb)])
# Fit the model on training data
# (the pipeline's preprocessor step handles imputation and encoding inside
# fit, so the raw X_train is passed straight in)
model.fit(X_train, y_train)
Pipeline(steps=[('pre',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='median'))]),
['Customer_Age',
'Dependent_count',
'Months_on_book',
'Total_Relationship_Count',
'Months_Inactive_12_mon',
'Contacts_Count_12_mon',
'Credit_Limit',
'Total_Revolving_Bal',
'Avg_Open_To_Buy',
'Total_Trans_Amt',
'...
('cat',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('onehot',
OneHotEncoder(handle_unknown='ignore'))]),
['Gender', 'Education_Level',
'Marital_Status',
'Income_Category',
'Card_Category'])])),
('GB',
GradientBoostingClassifier(init=AdaBoostClassifier(random_state=1),
max_features=0.7, n_estimators=250,
random_state=1, subsample=0.8))])